#!/usr/bin/env xonsh
from tqdm import tqdm
from tzutil.extract import extract_li
import msgpack
from os.path import exists,join,dirname,abspath

# Absolute directory containing this script.
PWD = dirname(abspath(__file__))
# Make that directory the working directory (xonsh subprocess-mode `cd`), so
# the relative paths used by the rest of the script resolve next to the script.
cd @(PWD)

# Path of the msgpack checkpoint file that stores a page number between runs.
PAGE = join(PWD,"bidding.csg.cn.page")

def load(filepath):
  """Return the msgpack-decoded content of `filepath`.

  Returns None when `filepath` does not exist.
  """
  # Guard clause: a missing file simply means there is nothing to load.
  if not exists(filepath):
    return None
  with open(filepath, "rb") as source:
    decoded = msgpack.load(source)
  return decoded

def dump(filepath, value):
  """Serialize `value` with msgpack into `filepath`.

  The file is opened in binary write mode, so any previous content is replaced.
  """
  with open(filepath, "wb") as sink:
    msgpack.dump(value, sink)

# Working directories (relative to the current directory): `index/` receives
# the listing page currently being parsed, `doc/` the downloaded documents.
mkdir -p index
mkdir -p doc

# Document ids already requested during this run (kept in memory only).
exist = set()
# Resume from the page number stored in the checkpoint file, or start at 1
# when there is no checkpoint (load() returned None).
page = load(PAGE) or 1

# Walk the listing pages from the resume point up to (excluding) 30710.
# NOTE(review): 30710 is hard-coded — presumably the number of listing pages
# when the script was written; confirm before re-running.
# NOTE(review): the exit status of the wget calls is never inspected, so a
# failed download still lets the checkpoint advance — confirm this is intended.
for page in tqdm(range(page,30710)):
  filename = f"index/{page}.html"
  # Download listing page number `page` into the file named above.
  wget -c f"http://www.bidding.csg.cn/zbcg/index_{page}.jhtml" -O @(filename)
  # errors="surrogateescape" keeps undecodable bytes from raising on read.
  with open(filename,errors="surrogateescape") as f:
    txt = f.read()
    # extract_li presumably yields the text found between 'href="/zbhxrgs/'
    # and '.jhtml' (verify against tzutil); duplicates are dropped and the
    # remaining strings are processed in sorted (string) order.
    for i in sorted(set(extract_li('href="/zbhxrgs/','.jhtml', txt))):
      # Only purely numeric matches are treated as document ids.
      if i.isdigit():
        # Shard the output: the directory name is the id without its last four
        # characters, so one directory groups up to 10,000 numeric ids.
        outdir = f"doc/{i[:-4]}"
        if not exists(outdir):
          mkdir -p @(outdir)
        i = int(i)
        # Request each id at most once per run; it is marked as seen before
        # the download is attempted.
        if i not in exist:
          exist.add(i)
          wget -c http://www.bidding.csg.cn/zbhxrgs/@(i).jhtml -O @(outdir)/@(i).html
  # The listing page is no longer needed once its links have been processed.
  rm -rf @(filename)
  # Record the page just processed; because the next run starts its range at
  # this stored value, that page is fetched again on resume.
  dump(PAGE, page)
